What does one of the world’s most famous bikeshares look like, on one of its city’s wildest nights? In this project, I track usage of the Paris Vélib system from 6 PM on Bastille day through to 6 AM the next morning. To do so, I leverage the excellent open data system that the Paris Mayor’s Office has put in place.

#Getting data on where stations are 
station_data2<- fromJSON("https://velib-metropole-opendata.smoove.pro/opendata/Velib_Metropole/station_information.json")

station_data <- as_tibble(station_data2$data$stations)
station_data <- station_data %>% 
  select(-stationCode, -rental_methods)
write.csv(station_data, "nyt_climate_data/station_data.csv")

As mentioned, I ran this code over a specific timeframe. I have therefore opted to not have this code evaluated.

#Creating a function that takes a number of times for the machine to collect data, as well as the rest time in between in seconds
velib_night_function <- function(i, s){
  message(Sys.time())  
  json <- fromJSON(content(GET("https://velib-metropole-opendata.smoove.pro/opendata/Velib_Metropole/station_status.json"), "text"))
    velib_availab1 <- (json)
    velib_prev <- velib_availab1$data$stations

    velib_prev <- velib_prev %>% 
      select(station_id, num_bikes_available, num_docks_available, is_installed, is_renting, is_returning)
    velib_prev <- velib_prev %>% 
      filter(is_installed==1 & is_returning==1 & is_renting==1)
    velib_prev <- velib_prev %>%
      select(-is_installed, -is_renting, -is_returning)
    velib_prev <- rename(velib_prev, "num_bikes_available_prev"=num_bikes_available, "num_docks_available_prev"=num_docks_available)
    Sys.sleep(s)
    a=1

  while (a<=i){
    json <- fromJSON(content(GET("https://velib-metropole-opendata.smoove.pro/opendata/Velib_Metropole/station_status.json"), "text"))
    velib_availab1 <- (json)
    velib_now <- velib_availab1$data$stations

    velib_now <- velib_now %>% 
      select(station_id, num_bikes_available, num_docks_available, is_installed, is_renting, is_returning)
    velib_now <- velib_now %>% 
      filter(is_installed==1 & is_returning==1 & is_renting==1)
    velib_now <- velib_now %>%
      select(-is_installed, -is_renting, -is_returning)
    
    #Generate comparison
    velib_comp <- left_join(velib_prev, velib_now, by="station_id")
    velib_comp$docks_avail_change <- abs(velib_comp$num_docks_available_prev-velib_comp$num_docks_available)
    velib_comp$bikes_avail_change <- abs(velib_comp$num_bikes_available_prev-velib_comp$num_bikes_available)
    velib_comp <- velib_comp %>% 
      select(station_id, docks_avail_change, bikes_avail_change)
    #Adding a time tracker for the time, in Paris (one hour ahead of my system time)
    velib_comp$time_track<- (Sys.time()+hours(1))
    
    #Put just the changes into a new dataframe
    velib_night_tracker <- rbind(velib_night_tracker, velib_comp)

    #Make the now into prev
    velib_prev <-velib_now
    velib_prev <- rename(velib_prev, "num_bikes_available_prev"=num_bikes_available, "num_docks_available_prev"=num_docks_available)
    message(Sys.time())
    a<-a+1
    Sys.sleep(s)
    
  }
  
  return(velib_night_tracker)
}
velib_night_tracker <- data.frame()

#Takes 2 seconds to do, so 58 seconds in between instead of 60
velib_night <- velib_night_function(1500, 58)
write.csv(velib_night, "nyt_climate_data/velib_night.csv")
velib_night <-read.csv("nyt_climate_data/velib_night.csv")
docks_df <- velib_night%>% 
  group_by(station_id)%>%
  summarise(docks_change=sum(docks_avail_change))
bikes_df <- velib_night %>%
  group_by(station_id) %>%
  summarise(bikes_change = sum(bikes_avail_change))
velib_change <- left_join(docks_df, bikes_df, by="station_id")
write.csv(velib_change, "nyt_climate_data/velib_change.csv")


bikes_df %>%
  arrange(desc(bikes_change))%>%
  head()
## # A tibble: 6 × 2
##   station_id bikes_change
##        <dbl>        <int>
## 1  394404659          499
## 2   39149651          464
## 3   66491397          456
## 4  128884445          410
## 5  476155906          399
## 6  100923502          393
docks_df %>%
  arrange(desc(docks_change))%>%
  head()
## # A tibble: 6 × 2
##   station_id docks_change
##        <dbl>        <int>
## 1  394404659          474
## 2   39149651          468
## 3   66491397          452
## 4  128884445          411
## 5   49423507          390
## 6  100923502          390
velib_change <- read.csv("nyt_climate_data/velib_change.csv")
velib_change <- velib_change %>%
  select(-X)
station_data <- read.csv("nyt_climate_data/station_data.csv")
station_data <- station_data %>%
  select(-X)
velib_night <- read.csv("nyt_climate_data/velib_night.csv")
velib_night <- velib_night %>%
  select(-X)



velib_change <- rename(velib_change, "change_station_id"=station_id)
velib_night <- rename(velib_night, "night_station_id"=station_id)


db <- dbConnect(RSQLite::SQLite(), "nyt_climate_data/velib.sqlite")

dbWriteTable(db, "velib_change", velib_change, overwrite=TRUE)
dbWriteTable(db, "station_data", station_data, overwrite=TRUE)
dbWriteTable(db, "velib_night", velib_night, overwrite=TRUE)

dbGetQuery(db, "SELECT *
           FROM station_Data 
           LIMIT 5
           ")
##    station_id                                   name      lat      lon capacity
## 1   213688169          Benjamin Godard - Victor Hugo 48.86598 2.275725       35
## 2   653222953              Mairie de Rosny-sous-Bois 48.87126 2.486581       30
## 3       36255                     Toudouze - Clauzel 48.87930 2.337360       21
## 4    37815204                        Mairie du 12ème 48.84086 2.387555       30
## 5 17486274358 Jean Rostand - Paul Vaillant Couturier 48.90817 2.453060        0
dbGetQuery(db, "SELECT COUNT(*)
           FROM station_Data 
           LIMIT 5
           ")
##   COUNT(*)
## 1     1465
dbGetQuery(db, "SELECT *
           FROM velib_change 
           LIMIT 5
           ")
##   change_station_id docks_change bikes_change
## 1              6245           96           95
## 2              6293           65           65
## 3              6294          149          149
## 4              6295          117          117
## 5              6296          132          126
dbGetQuery(db, "SELECT COUNT(*)
           FROM velib_change 
           LIMIT 5
           ")
##   COUNT(*)
## 1     1419
dbGetQuery(db, "SELECT *
           FROM velib_night 
           LIMIT 5
           ")
##   night_station_id docks_avail_change bikes_avail_change          time_track
## 1        213688169                  0                  0 2023-07-14 13:00:17
## 2        653222953                  0                  0 2023-07-14 13:00:17
## 3            36255                  0                  0 2023-07-14 13:00:17
## 4         37815204                  1                  1 2023-07-14 13:00:17
## 5        251039991                  0                  0 2023-07-14 13:00:17
dbGetQuery(db, "SELECT COUNT(*)
           FROM velib_night 
           LIMIT 5
           ")
##   COUNT(*)
## 1  2110822

Since a small number of stations weren’t in service on Saturday night, the number of rows in tables 1 and 2 are slightly different.

  1. Demonstrate which of the tables can be joined. Return only the first five rows of the joined tables, and also return the total number of rows in the joined tables with a query.
dbGetQuery(db, "SELECT *
           FROM station_data 
           LEFT JOIN (SELECT *
                      FROM velib_change)
           ON station_id=change_station_id
           ORDER BY bikes_change DESC
           LIMIT 5
           ")
##   station_id                                  name      lat      lon capacity
## 1  394404659     Gare Saint-Lazare - Cour du Havre 48.87567 2.326560       45
## 2   39149651 Richard Lenoir - Place de la Bastille 48.85383 2.369722       53
## 3   66491397     Gare de Lyon - Place Louis Armand 48.84552 2.372650       60
## 4  128884445              Constantine - Université 48.86068 2.314848       65
## 5  476155906                 Saint-Antoine Sévigné 48.85502 2.361232       26
##   change_station_id docks_change bikes_change
## 1         394404659          474          499
## 2          39149651          468          464
## 3          66491397          452          456
## 4         128884445          411          410
## 5         476155906          389          399
dbGetQuery(db, "SELECT COUNT(*)
           FROM station_data 
           LEFT JOIN (SELECT *
                      FROM velib_change)
           ON station_id=change_station_id
           ORDER BY bikes_change DESC
           ")
##   COUNT(*)
## 1     1465
dbGetQuery(db, "SELECT *
           FROM velib_night 
           LEFT JOIN (SELECT *
                      FROM station_data)
           ON night_station_id=station_id
           LIMIT 5
           ")
##   night_station_id docks_avail_change bikes_avail_change          time_track
## 1        213688169                  0                  0 2023-07-14 13:00:17
## 2        653222953                  0                  0 2023-07-14 13:00:17
## 3            36255                  0                  0 2023-07-14 13:00:17
## 4         37815204                  1                  1 2023-07-14 13:00:17
## 5        251039991                  0                  0 2023-07-14 13:00:17
##   station_id                          name      lat      lon capacity
## 1  213688169 Benjamin Godard - Victor Hugo 48.86598 2.275725       35
## 2  653222953     Mairie de Rosny-sous-Bois 48.87126 2.486581       30
## 3      36255            Toudouze - Clauzel 48.87930 2.337360       21
## 4   37815204               Mairie du 12ème 48.84086 2.387555       30
## 5  251039991   Cassini - Denfert-Rochereau 48.83753 2.336035       25
dbGetQuery(db, "SELECT COUNT(*)
           FROM velib_night 
           LEFT JOIN (SELECT *
                      FROM station_data)
           ON night_station_id=station_id
           LIMIT 5
           ")
##   COUNT(*)
## 1  2110822
  1. I then go ahead and query the database to generate some fun maps of usage.

This highlights the locations of the top 100 stations that night.

top_100 <- dbGetQuery(db, "SELECT *
           FROM station_Data 
           LEFT JOIN (SELECT change_station_id, bikes_change
                      FROM velib_change)
           ON station_id=change_station_id
           ORDER BY bikes_change DESC
           LIMIT 100
           ")

bbox_paris <- c(2.2, 48.80, 2.5, 48.92)
map <- get_stamenmap(bbox_paris, zoom=12, maptype="terrain")
## ℹ Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
#ggmap(map)

ggmap(map)+
  geom_point(data=top_100, mapping=aes(x=lon, y=lat, color=bikes_change))+
  scale_color_distiller(palette = "YlOrRd", direction = 1)+
  labs(title="Map of the top 100 Vélib stations with highest number of bike changes, \nJuly 14-15", 
       caption="Source: Vélib API \nMap tiles by Stamen Design, under CC BY 3.0. \nMap data by OpenStreetMap, under ODbL", 
       color="Bike changes")+
  theme(axis.line=element_blank(),
      axis.text.x=element_blank(),
      axis.text.y=element_blank(),
      axis.ticks=element_blank(),
      axis.title.x=element_blank(),
      axis.title.y=element_blank())

As one can see, most of the stations that got the most use were downtown and in the East of Paris – which makes sense, this is where the nightlife is, and where people are more likely to bike to get around, as opposed to drive.

Whereas this demonstrates the location of the stations that were not touched once in that period.

zero <- dbGetQuery(db, "SELECT *
           FROM station_Data 
           RIGHT JOIN (SELECT change_station_id, bikes_change
                      FROM velib_change
                      WHERE bikes_change==0)
           ON station_id=change_station_id
           ")
           
ggmap(map)+
  geom_point(data=zero, mapping=aes(x=lon, y=lat))+
  labs(title="Map of the Vélib stations that saw no bike moves, \nSat Jan 14 22:30- Sun Jan 15 1:00", 
       caption="Source: Vélib API, Open Street Maps", 
       color="Bike moves")+
  theme(axis.line=element_blank(),
      axis.text.x=element_blank(),
      axis.text.y=element_blank(),
      axis.ticks=element_blank(),
      axis.title.x=element_blank(),
      axis.title.y=element_blank())     

There are stations with not usage scattered a bit around everywhere, but mainly in the Paris periphery (an additional 42 stations didn’t even make it onto the map).

This next, slightly overwhelming charts presents the data for all the stations in the network (minus a few that don’t fit on my map).

everything <- dbGetQuery(db, "SELECT *
           FROM station_Data 
           LEFT JOIN (SELECT change_station_id, bikes_change
                      FROM velib_change)
           ON station_id=change_station_id
           ")



ggmap(map)+
  geom_point(data=everything, mapping=aes(x=lon, y=lat, color=bikes_change))+
  scale_color_distiller(palette = "YlOrRd", direction = 1)+
  labs(title="Map of the Vélib stations by number of bike moves, \nSat Jan 14 22:30- Sun Jan 15 1:00", 
       caption="Source: Vélib API, Open Street Maps", 
       color="Bike moves")+
  theme(axis.line=element_blank(),
      axis.text.x=element_blank(),
      axis.text.y=element_blank(),
      axis.ticks=element_blank(),
      axis.title.x=element_blank(),
      axis.title.y=element_blank())   
## Warning: Removed 100 rows containing missing values (`geom_point()`).

We can also use the data on the top 20 bike stations, to see what the change patterns looked like over a night.

p<- dbGetQuery(db, "SELECT *
           FROM velib_night 
           WHERE night_station_id IN(SELECT change_station_id
                      FROM velib_change
                      ORDER BY docks_change DESC
                      LIMIT 100)
           ")


summary(p$docks_avail_change)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2102  0.0000 20.0000
p$time_track <- ymd_hms(p$time_track)

p$hour <- hour(p$time_track)

p2 <- p


p3 <- p2 %>% 
  group_by(night_station_id, hour) %>%
  summarise(sum=sum(docks_avail_change))
## `summarise()` has grouped output by 'night_station_id'. You can override using
## the `.groups` argument.
p3$hour <- paste0(as.character(p3$hour), ":00")


color="#003f5c"

ggplot(p3, aes(x=factor(hour), y=sum))+
  geom_point()+
  geom_smooth()+
scale_x_discrete(limits = c("13:00", "14:00", "15:00", "16:00", "17:00", "18:00", "19:00", "20:00", "21:00", "22:00", "23:00", "0:00", "1:00", "2:00", "3:00", "4:00", "5:00", "6:00", "7:00", "8:00", "9:00", "10:00", "11:00", "12:00"))+
  labs(title="Evolution of number of bike moves grouped by hour, for top 20 stations, \nJuly 14 - July 15", 
       caption="Source: Vélib API, Open Street Maps", 
       x="Hour", 
       y="Number of bike moves by hour")+
  theme_bw()+
theme(axis.text.x = element_text(size = 6))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

This suggests that the peak time to be getting a bike at these stations is at 23:00, though this likely undercounts the 22:00 hour, since data collection only started at 22:30.

Finally, we can also look at calculate usage metrics as a share of total bike capacity at each station

capacity_df <- dbGetQuery(db, "SELECT *
           FROM station_data 
           LEFT JOIN (SELECT *
                      FROM velib_change)
           ON station_id=change_station_id
           ")

capacity_df$usage <- capacity_df$docks_change/capacity_df$capacity*100

print(paste0("The correlation between number of bike changes and station usage is ", cor(capacity_df$bikes_change, capacity_df$usage, method="pearson", use="complete.obs")))
## [1] "The correlation between number of bike changes and station usage is 0.73914451139539"
ggmap(map)+
  geom_point(data=capacity_df, mapping=aes(x=lon, y=lat, color=usage))+
  scale_color_distiller(palette = "YlOrRd", direction = 1)+
  labs(title="Map of the Vélib stations by number of bike usage as a share of capacity, \nSat Jan 14 22:30- Sun Jan 15 1:00", 
       caption="Source: Vélib API, Open Street Maps", 
       color="Bike usage")+
  theme(axis.line=element_blank(),
      axis.text.x=element_blank(),
      axis.text.y=element_blank(),
      axis.ticks=element_blank(),
      axis.title.x=element_blank(),
      axis.title.y=element_blank())   
## Warning: Removed 100 rows containing missing values (`geom_point()`).

There is a high correlation between the two variables, but usage does suggest a slightly different picture, since some smaller stations may have been constrained by the number of bikes they could offer to riders. A number of stations experiencing their total capacity of bikes coming and going in just those 2.5 hours! They still tend to be in the center and towards the East, and you can even somewhat see the angle of the Grands Boulevards stations in the North.

I’ll conclude by using usage to see if there any trends by longitude or latitude.

ggplot(data=capacity_df, aes(x=lon, y=usage))+
  geom_point(aes(color=usage))+
  geom_smooth(color="#bc5090")+
  labs(title="Station usage by longitude, \nSat Jan 14 22:30- Sun Jan 15 1:00", 
       caption="Source: Vélib API, Open Street Maps", 
       color="Bike usage", 
       x="Longitude", 
       y="usage")+
  theme_bw()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 47 rows containing missing values (`geom_point()`).

ggplot(data=capacity_df, aes(x=lat, y=usage))+
  geom_point(aes(color=usage))+
  coord_flip()+
  geom_smooth(color="#bc5090")+
  labs(title="Station usage by latitude, \nSat Jan 14 22:30- Sun Jan 15 1:00", 
       caption="Source: Vélib API, Open Street Maps", 
       color="Bike usage", 
       x="Latitude", 
       y="Usage")+
  theme_bw()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 47 rows containing non-finite values (`stat_smooth()`).
## Removed 47 rows containing missing values (`geom_point()`).

For both latitude and longitude, it is clear that central locations were more popular on Saturday night.

This project has therefore more or less confirmed priors about how centrality in Paris would affect usage of bikes. And thought our method of sampling likely undercounts total rides by only campling status changes every minute, it still finds a sizable number of trips conducted on Saturday night.

sum(velib_change$docks_change, na.rm=TRUE)
## [1] 187479

Indeed, the total number of bike changes was over 8000, which suggests at least 4000 trips were conducted over the two and a half hours, or more than 27 trips/minute.

#Looking just at 6 PM- AM